# Read uspto dataset
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split

# Load the patents CSV and keep only the columns used downstream.
usptodata = pd.read_csv('U.S. Patents.csv')
usptodataset = usptodata[["grant_id", "claims_text", "abstract"]]
# Drop rows with any missing field, then renumber the index.
usptodataset = usptodataset.dropna()
usptodataset = usptodataset.reset_index(drop=True)
# Tokenize each claim text. Series.apply on the single column is simpler and
# faster than the original row-wise DataFrame.apply(..., axis=1) and produces
# identical token lists.
usptodataset['tokenized_claims_text'] = usptodataset['claims_text'].apply(nltk.word_tokenize)
usptodataset.head()
#split data to have little data to run
# divide dataset to train and test: only 20% is kept as df_train
# (test_size=0.8) so the topic models below train quickly.
df_train, df_test = train_test_split(usptodataset, test_size=0.8, random_state=25)
df_train = df_train.reset_index(drop=True)
df_train
| grant_id | claims_text | abstract | tokenized_claims_text | |
|---|---|---|---|---|
| 0 | US10459019 | 1. An electromagnetic sensor comprising:,a fir... | An electromagnetic sensor includes a first mag... | [1, ., An, electromagnetic, sensor, comprising... |
| 1 | US10456083 | 1. A method for mapping somatosensory and moto... | An apparatus for cortical mapping and method f... | [1, ., A, method, for, mapping, somatosensory,... |
| 2 | US10461549 | 1. A method for charging a mobile terminal, th... | The disclosure discloses a mobile terminal, a ... | [1, ., A, method, for, charging, a, mobile, te... |
| 3 | US10462815 | 1. A method for a User Equipment (UE) operatin... | The present invention relates to a wireless co... | [1, ., A, method, for, a, User, Equipment, (, ... |
| 4 | US10458026 | 1. A method of producing graphene sheets compr... | A method of producing graphene sheets comprisi... | [1, ., A, method, of, producing, graphene, she... |
| ... | ... | ... | ... | ... |
| 1397 | US10458777 | 1. A method of measuring a metrology target el... | Targets, target elements and target design met... | [1, ., A, method, of, measuring, a, metrology,... |
| 1398 | US10458022 | 1. A method for anti-corrosive treatment of me... | A method for corrosion protection treatment, c... | [1, ., A, method, for, anti-corrosive, treatme... |
| 1399 | US10462550 | 1. A storage device comprising:,a first case c... | A storage device includes a first case, a seco... | [1, ., A, storage, device, comprising, :, ,a, ... |
| 1400 | US10456037 | 1. A terminal device configured to be able to ... | A terminal device is provided which is configu... | [1, ., A, terminal, device, configured, to, be... |
| 1401 | US10461138 | 1. An organic light-emitting display device, c... | An organic light-emitting display device and a... | [1, ., An, organic, light-emitting, display, d... |
1402 rows × 4 columns
# Pull the tokenized claims and the raw claim strings out of the training split.
dictionary = df_train["tokenized_claims_text"].tolist()
#dictionary[:2]
docs = df_train["claims_text"].tolist()
type(docs)
list
#LDA with Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import en_core_web_sm
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# LDA with octis
from octis.models.LDA import LDA
# Raw claim texts to model, plus the POS tags kept during lemmatization.
docs = df_train["claims_text"].tolist()
allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]
def lemmatize(docs, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    docs : iterable of str
        Raw document strings.
    allowed_postags : sequence of str, optional
        spaCy coarse POS tags to keep. Default keeps content words only.
        (A tuple default replaces the original mutable list default.)

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document.
    """
    # Parser and NER are disabled: only tagging/lemmatization is needed,
    # which makes the pipeline noticeably faster.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    allowed = set(allowed_postags)  # O(1) membership tests in the inner loop
    return [
        " ".join(token.lemma_ for token in nlp(doc) if token.pos_ in allowed)
        for doc in docs
    ]
def tokenize(docs):
    """Convert each string into a list of lowercase tokens.

    gensim's simple_preprocess lowercases, removes accents (deacc=True) and
    drops tokens outside its default length bounds.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in docs]
# Pre-process input: lemmatization and tokenization
lemmatized_docs = lemmatize(docs)
tokenized_docs = tokenize(lemmatized_docs)
# Mapping from word IDs to words
id2word = corpora.Dictionary(tokenized_docs)
# Prepare Document-Term Matrix: one bag-of-words vector per document
# (comprehension replaces the original append loop).
corpus = [id2word.doc2bow(doc) for doc in tokenized_docs]
# Fit LDA model:
lda_model = gensim.models.ldamodel.LdaModel(
corpus = corpus, # Document-Term Matrix
id2word = id2word, # Map word IDs to words
num_topics = 30, # Number of latent topics to extract
random_state = 100, # Fixed seed so the learned topics are reproducible
passes = 100, # N° of passes through the corpus during training
)
# Visualize with pyLDAvis:
pyLDAvis.enable_notebook()
visualization = pyLDAvis.gensim_models.prepare(
lda_model,
corpus,
id2word,
mds = "mmds", # Multidimensional-scaling method for the intertopic distance map
R = 30) # Number of terms shown per topic in the barchart panel
visualization
C:\ProgramData\Anaconda3\lib\site-packages\pyLDAvis\_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only default_term_info = default_term_info.sort_values(
#!pip3 uninstall https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric

# Print the top-5 words of each LDA topic and collect a cleaned word list per topic.
# Fix: the original called `topic_model.show_topics(...)`, but no `topic_model`
# is defined in this file — the fitted gensim model is `lda_model`.
lda_topics = lda_model.show_topics(num_words=5)
topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
for topic in lda_topics:
    print(topic)  # (topic_id, '0.085*"word" + ...')
    # Strip weights, punctuation and numbers so only the topic words remain.
    topics.append(preprocess_string(topic[1], filters))
print('----------------------------------------------------------------------------------')
[print(a) for a in topics];
(24, '0.085*"power" + 0.041*"charge" + 0.029*"claim" + 0.019*"mobile" + 0.018*"battery"') (25, '0.050*"claim" + 0.032*"method" + 0.031*"comprise" + 0.030*"group" + 0.023*"composition"') (6, '0.250*"say" + 0.037*"claim" + 0.025*"comprise" + 0.021*"method" + 0.017*"said"') (22, '0.041*"location" + 0.025*"more" + 0.020*"claim" + 0.020*"base" + 0.020*"segment"') (14, '0.132*"light" + 0.050*"device" + 0.043*"emit" + 0.021*"claim" + 0.018*"plurality"') (20, '0.043*"gas" + 0.035*"engine" + 0.030*"claim" + 0.022*"method" + 0.021*"segment"') (4, '0.071*"object" + 0.058*"area" + 0.045*"display" + 0.025*"datum" + 0.021*"point"') (16, '0.068*"device" + 0.031*"datum" + 0.024*"claim" + 0.024*"network" + 0.021*"communication"') (23, '0.038*"end" + 0.034*"member" + 0.026*"claim" + 0.024*"portion" + 0.021*"least"') (11, '0.076*"least" + 0.062*"at" + 0.024*"ray" + 0.016*"claim" + 0.014*"device"') ---------------------------------------------------------------------------------- ['power', 'charge', 'claim', 'mobile', 'battery'] ['claim', 'method', 'comprise', 'group', 'composition'] ['say', 'claim', 'comprise', 'method', 'said'] ['location', 'more', 'claim', 'base', 'segment'] ['light', 'device', 'emit', 'claim', 'plurality'] ['gas', 'engine', 'claim', 'method', 'segment'] ['object', 'area', 'display', 'datum', 'point'] ['device', 'datum', 'claim', 'network', 'communication'] ['end', 'member', 'claim', 'portion', 'least'] ['least', 'at', 'ray', 'claim', 'device']
######################## NMF #################################
warnings.filterwarnings("ignore", category=DeprecationWarning)
allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]
# Fix: the original line read `docs = docs = ...` (duplicated assignment).
docs = list(df_train.loc[:, "claims_text"].values)
def lemmatize(docs, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    docs : iterable of str
        Raw document strings.
    allowed_postags : sequence of str, optional
        spaCy coarse POS tags to keep (content words by default). A tuple
        default replaces the original mutable list default.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document.
    """
    # Only the tagger/lemmatizer is needed; disabling parser and NER speeds
    # up both model loading and per-document processing.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    allowed = set(allowed_postags)  # O(1) membership tests
    return [
        " ".join(token.lemma_ for token in nlp(doc) if token.pos_ in allowed)
        for doc in docs
    ]
def tokenize(docs):
    """Split each string into lowercase tokens via gensim's simple_preprocess.

    deacc=True additionally strips accents from tokens.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in docs]
# Pre-process input: lemmatization and tokenization
lemmatized_docs = lemmatize(docs)
tokenized_docs = tokenize(lemmatized_docs)
# Mapping from word IDs to words
id2word = corpora.Dictionary(tokenized_docs)
# Prepare Document-Term Matrix: one bag-of-words vector per document
# (comprehension replaces the original append loop).
corpus = [id2word.doc2bow(doc) for doc in tokenized_docs]
# Fit NMF model: See [1] for more details
nmf_model = gensim.models.Nmf(
corpus = corpus, # Document-Term Matrix
id2word = id2word, # Map word IDs to words
num_topics = 30, # Number of latent topics to extract
random_state = 100, # Fixed seed so the factorization is reproducible
passes = 100, # N° of passes through the corpus during training
)
# Get the topics sorted by sparsity
#nmf_model.show_topics()
nmf_topics = nmf_model.show_topics(num_words=5)
# Print each NMF topic and keep a cleaned list of its top words.
topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
for topic_entry in nmf_topics:
    print(topic_entry)  # (topic_id, '0.040*"word" + ...')
    # Drop weights, punctuation and digits so only the words remain.
    topics.append(preprocess_string(topic_entry[1], filters))
print('----------------------------------------------------------------------------------')
[print(a) for a in topics];
(18, '0.040*"claim" + 0.019*"surface" + 0.016*"comprise" + 0.015*"position" + 0.013*"member"') (24, '0.149*"carbonyl" + 0.120*"phenyl" + 0.056*"urea" + 0.053*"difluoro" + 0.049*"substitute"') (5, '0.105*"layer" + 0.029*"claim" + 0.022*"material" + 0.020*"electrode" + 0.019*"conductive"') (6, '0.054*"display" + 0.036*"more" + 0.028*"object" + 0.022*"claim" + 0.021*"input"') (19, '0.041*"channel" + 0.037*"connect" + 0.031*"gate" + 0.024*"zone" + 0.023*"wireless"') (8, '0.095*"light" + 0.078*"image" + 0.044*"generation" + 0.044*"range" + 0.035*"execution"') (3, '0.104*"side" + 0.096*"transistor" + 0.075*"output" + 0.064*"couple" + 0.059*"high"') (29, '0.081*"terminal" + 0.069*"switch" + 0.048*"controllable" + 0.042*"circuit" + 0.041*"first"') (11, '0.100*"memory" + 0.078*"say" + 0.039*"copy" + 0.031*"unit" + 0.030*"datum"') (26, '0.073*"say" + 0.048*"module" + 0.038*"camera" + 0.033*"board" + 0.033*"circuit"') ---------------------------------------------------------------------------------- ['claim', 'surface', 'comprise', 'position', 'member'] ['carbonyl', 'phenyl', 'urea', 'difluoro', 'substitute'] ['layer', 'claim', 'material', 'electrode', 'conductive'] ['display', 'more', 'object', 'claim', 'input'] ['channel', 'connect', 'gate', 'zone', 'wireless'] ['light', 'image', 'generation', 'range', 'execution'] ['side', 'transistor', 'output', 'couple', 'high'] ['terminal', 'switch', 'controllable', 'circuit', 'first'] ['memory', 'say', 'copy', 'unit', 'datum'] ['say', 'module', 'camera', 'board', 'circuit']
##################### BERTOPIC ###################################
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
# Corpus: the training-set claim texts.
# Fix: the original line read `docs = docs = ...` (duplicated assignment) and
# its comment wrongly mentioned the 20 newsgroups dataset.
docs = list(df_train.loc[:, "claims_text"].values)
# Embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Clustering model: See [2] for more details
cluster_model = HDBSCAN(
    min_cluster_size=25,              # smallest document group treated as a topic
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,             # keep data needed to assign topics to new docs
)
# BERTopic model
Bertopic_model = BERTopic(embedding_model=embedding_model,
                          hdbscan_model=cluster_model)
# Fit the model on a corpus
topics, probs = Bertopic_model.fit_transform(docs)
# Visualization examples
# Save intertopic distance map as HTML file
#topic_model.visualize_topics().write_html("/intertopic_dist_map.html")
# Save topic-terms barcharts as HTML file
#topic_model.visualize_barchart(top_n_topics = 25).write_html("/barchart.html")
# Save documents projection as HTML file
#topic_model.visualize_documents(docs).write_html("/projections.html")
# Save topics dendrogram as HTML file
#topic_model.visualize_hierarchy().write_html("/hieararchy.html")
#intertopic distance map
Bertopic_model.visualize_topics()
# topic-terms barcharts
#Bertopic_model.visualize_barchart(top_n_topics = 25)
#documents projection
Bertopic_model.visualize_documents(docs)
# topics dendrogram
Bertopic_model.visualize_hierarchy()
######################### Top2Vec###################################
from top2vec import Top2Vec
from sklearn.datasets import fetch_20newsgroups
# Corpus: the training-set claim texts.
# Fix: the original line read `docs = docs = ...` (duplicated assignment).
docs = list(df_train.loc[:, "claims_text"].values)
# Create jointly embedded topic, document and word vectors
Top2Vec_model = Top2Vec(
    docs,
    embedding_model='doc2vec',  # Embedding model: See [1,2] for supported models
    min_count=50,               # Ignore words less frequent than this value
    umap_args=None,             # Dict of custom args for UMAP
    hdbscan_args=None,          # Dict of custom args for HDBSCAN
)
# Visualization examples: See [1,2] for more details
# Search the closest 5 topics to the input query "faith"
# topic_words, word_scores, topic_scores, topic_nums = Top2Vec_model.search_topics(
#     keywords = ["faith"],
#     num_topics = 5)
# Plot the resulting topics as wordclouds
# for topic in topic_nums:
#     topic_model.generate_topic_wordcloud(topic)
2022-12-26 09:47:17,188 - top2vec - INFO - Pre-processing documents for training 2022-12-26 09:47:20,088 - top2vec - INFO - Creating joint document/word embedding 2022-12-26 09:48:38,684 - top2vec - INFO - Creating lower dimension embedding of documents 2022-12-26 09:48:42,781 - top2vec - INFO - Finding dense areas of documents 2022-12-26 09:48:42,830 - top2vec - INFO - Finding topics
# Retrieve all discovered topics: top words per topic, their scores, and topic ids.
topic_words, word_scores, topic_nums = Top2Vec_model.get_topics()
print(topic_words)
[['extending' 'extends' 'longitudinal' 'engaging' 'distal' 'axial' 'body' 'radially' 'outward' 'spaced' 'away' 'rotate' 'opening' 'circumference' 'axially' 'extend' 'end' 'inward' 'sides' 'outer' 'outwardly' 'wall' 'flange' 'threaded' 'assembly' 'mounting' 'spring' 'annular' 'engage' 'inwardly' 'housing' 'circumferential' 'engages' 'integrally' 'along' 'shoulder' 'inner' 'bottom' 'apart' 'rear' 'positioned' 'formed' 'rotatably' 'therebetween' 'fastener' 'disposed' 'inlet' 'aperture' 'locking' 'outlet'] ['service' 'readable' 'computer' 'executed' 'request' 'private' 'security' 'transitory' 'instructions' 'application' 'transaction' 'session' 'entity' 'client' 'protocol' 'executable' 'key' 'network' 'message' 'program' 'blockchain' 'services' 'credentials' 'transactions' 'server' 'code' 'cryptographic' 'user' 'text' 'software' 'proxy' 'account' 'token' 'file' 'identity' 'implemented' 'requesting' 'encrypted' 'policy' 'operations' 'record' 'editing' 'applications' 'computing' 'certificate' 'processor' 'cause' 'processors' 'public' 'executing'] ['layer' 'insulating' 'semiconductor' 'dielectric' 'doped' 'substrate' 'etching' 'bonding' 'nitride' 'exposed' 'conductive' 'conformal' 'metal' 'material' 'film' 'etch' 'solder' 'silicon' 'manufacturing' 'thickness' 'disposed' 'surface' 'formed' 'electrode' 'sidewalls' 'insulation' 'wafer' 'covered' 'surfaces' 'fin' 'top' 'adhesive' 'emitting' 'electrically' 'gate' 'liner' 'depositing' 'wiring' 'covers' 'mask' 'protruding' 'middle' 'copper' 'bottom' 'thin' 'stress' 'metallic' 'package' 'sidewall' 'structure'] ['voltage' 'circuit' 'amplifier' 'capacitor' 'comparator' 'switch' 'signal' 'output' 'analog' 'converter' 'resistor' 'transistor' 'oscillator' 'clock' 'inverter' 'outputs' 'input' 'switching' 'capacitance' 'voltages' 'differential' 'transistors' 'inductor' 'reset' 'integration' 'power' 'ac' 'capacitors' 'control' 'switches' 'drain' 'cascode' 'switched' 'gain' 'alternating' 'cycle' 'pulse' 'sampling' 'controllable' 'coupled' 
'stage' 'current' 'impedance' 'fast' 'polarity' 'oscillation' 'rf' 'gate' 'nand' 'logic'] ['composition' 'solvent' 'alkyl' 'carbon' 'atoms' 'formula' 'nh' 'polymer' 'copolymer' 'methyl' 'hydrocarbon' 'monomer' 'alkenyl' 'phenyl' 'hydrogen' 'aqueous' 'substituted' 'acid' 'aryl' 'urea' 'acrylate' 'catalyst' 'xb' 'ch' 'unsubstituted' 'soluble' 'combinations' 'calcium' 'salt' 'iron' 'compound' 'heteroaryl' 'titanium' 'mg' 'carbonyl' 'reaction' 'consisting' 'treated' 'meth' 'resistant' 'yl' 'wt' 'particles' 'coating' 'difluorophenyl' 'solid' 'independently' 'branched' 'containing' 'mixture'] ['viewpoint' 'image' 'sight' 'boundary' 'watermark' 'acquired' 'captured' 'acquiring' 'object' 'images' 'interest' 'obtained' 'values' 'coordinates' 'swathe' 'represented' 'differences' 'capturing' 'maps' 'difference' 'pixel' 'executes' 'option' 'correcting' 'variation' 'representation' 'projected' 'value' 'displaying' 'frames' 'pixels' 'imaging' 'region' 'distances' 'printing' 'color' 'similarity' 'processing' 'corrected' 'rough' 'extraction' 'luminance' 'scale' 'blue' 'correlation' 'acquire' 'points' 'colors' 'gesture' 'projecting'] ['trajectory' 'vehicle' 'speed' 'road' 'demand' 'velocity' 'acceleration' 'wheels' 'torque' 'motor' 'brake' 'propulsion' 'vehicles' 'lane' 'wheel' 'load' 'actuators' 'prescribed' 'actual' 'condition' 'movement' 'steering' 'electronics' 'autonomous' 'angular' 'safety' 'travel' 'actuator' 'estimated' 'plunger' 'drive' 'moving' 'changed' 'gradient' 'golf' 'established' 'changing' 'estimate' 'flight' 'acquired' 'calculated' 'duration' 'change' 'efficiency' 'detected' 'increase' 'detects' 'control' 'calculate' 'situation'] ['lens' 'refractive' 'focal' 'optical' 'incident' 'light' 'reflected' 'wavelength' 'polarization' 'irradiation' 'beam' 'illumination' 'crystal' 'emitted' 'nm' 'mirror' 'projection' 'convex' 'imaging' 'infrared' 'reflection' 'intensity' 'curvature' 'rays' 'imager' 'shape' 'concave' 'radius' 'index' 'particle' 'view' 'absorber' 'image' 
'contrast' 'perpendicular' 'grating' 'laser' 'plane' 'spot' 'cladding' 'surface' 'angle' 'visible' 'sequentially' 'ray' 'satisfied' 'objective' 'capturing' 'detector' 'opaque'] ['administering' 'seq' 'amino' 'acid' 'nucleic' 'cancer' 'polypeptide' 'peptide' 'pharmaceutical' 'acids' 'treating' 'protein' 'fragment' 'no' 'car' 'antibody' 'disease' 'acceptable' 'id' 'need' 'composition' 'dna' 'effective' 'gene' 'il' 'salt' 'expression' 'consisting' 'variant' 'mg' 'human' 'subject' 'domain' 'cells' 'yl' 'administration' 'bb' 'concentration' 'thereof' 'methyl' 'binding' 'dose' 'isolated' 'tissue' 'compound' 'carbohydrate' 'sequence' 'risk' 'substituted' 'formula'] ['decoding' 'decoded' 'decoder' 'encoder' 'decode' 'symbol' 'symbols' 'vector' 'bits' 'transform' 'prediction' 'vectors' 'encoded' 'encoding' 'values' 'chroma' 'candidate' 'inter' 'format' 'video' 'code' 'word' 'register' 'block' 'coefficient' 'value' 'stream' 'frames' 'rs' 'context' 'significant' 'instruction' 'number' 'representation' 'adaptive' 'indicate' 'bit' 'calculate' 'coefficients' 'representing' 'pixels' 'integer' 'numeric' 'previously' 'scheme' 'calculating' 'mapping' 'language' 'transitory' 'using'] ['downlink' 'ue' 'uplink' 'subframe' 'rrc' 'tdd' 'enb' 'station' 'csi' 'signaling' 'resources' 'resource' 'ues' 'bs' 'configurations' 'pucch' 'srs' 'periodicity' 'subcarriers' 'transmission' 'mac' 'rs' 'broadcast' 'bandwidth' 'serving' 'configuration' 'radio' 'allocation' 'interference' 'transmit' 'transmitting' 'transceiver' 'wireless' 'transmitted' 'message' 'shared' 'base' 'procedure' 'priority' 'network' 'duplex' 'information' 'physical' 'indicated' 'access' 'channel' 'symbol' 'stations' 'equipment' 'indication']]